1   import java.util.regex.*;
2   import java.util.*;
3   import java.io.*;
4   
/**
 * Command-line generator for the code needed by j.l.C.UnicodeScript.
 *
 * <p>Reads a script property file whose data lines look like
 * {@code 0041..005A    ; Latin # ...} and writes to standard output:
 * <ol>
 *   <li>one {@code "%05X    NAME"} line per code point covered by the input, and</li>
 *   <li>the source of a {@code Scripts} class containing a {@code UnicodeScript}
 *       enum together with its {@code scriptStarts} / {@code scripts} lookup tables.</li>
 * </ol>
 *
 * <p>Throughout this class a range is an {@code int[3]} of
 * {@code {firstCodePoint, lastCodePoint, scriptIndex}}; a script index of
 * {@code -1} stands for {@code UNKNOWN}.
 */
public class CharacterScript {

    /**
     * Matches one data line: a hex code point or {@code start..end} pair, a
     * semicolon, the script name, and a trailing {@code #} comment.
     */
    private static final Pattern RANGE_LINE = Pattern.compile(
        "(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s+;\\s+(\\w+)\\s+#.*");

    /**
     * Test-data hook; currently a no-op. Enable the commented-out call to
     * print to standard output.
     */
    static void fortest(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /** Writes formatted text to standard output (the generated code). */
    static void print(String fmt, Object... o) {
        System.out.printf(fmt, o);
    }

    /**
     * Diagnostic hook; currently a no-op. Enable the commented-out call to
     * print to standard output.
     */
    static void debug(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /**
     * Entry point.
     *
     * @param args exactly one element: the path of the script property file.
     *             With any other argument count a usage line is printed and
     *             the JVM exits with status 1. Any exception raised while
     *             generating is reported with a stack trace.
     */
    public static void main(String args[]){
        try {
            if (args.length != 1) {
                System.out.println(
                    "Usage: java CharacterScript script.txt  (output is written to stdout)");
                System.exit(1);
            }

            Map<String,Integer> scriptMap = new HashMap<String,Integer>();
            List<int[]> ranges;
            BufferedReader sbfr = new BufferedReader(
                new InputStreamReader(new FileInputStream(args[0]), "UTF-8"));
            try {
                ranges = readRanges(sbfr, scriptMap);
            } finally {
                // always release the file handle, even if parsing failed
                sbfr.close();
            }

            if (ranges.isEmpty()) {
                // Nothing to generate; say so instead of failing later on a
                // null script name.
                System.err.printf("No script ranges found in %s%n", args[0]);
                return;
            }

            debug("-----------------%n");
            debug("Total scripts=%s%n", scriptMap.size());
            debug("-----------------%n%n");

            // Invert name -> index into index -> name.
            String[] names = new String[scriptMap.size()];
            for (Map.Entry<String,Integer> e : scriptMap.entrySet()) {
                names[e.getValue().intValue()] = e.getKey();
            }
            // Script names match \w+ (ASCII word characters under the default
            // regex flags), so the upper-casing locale does not affect the result.
            String[] enumNames = new String[names.length];
            for (int k = 0; k < names.length; k++) {
                enumNames[k] = names[k].toUpperCase(Locale.ENGLISH);
            }

            // Per-code-point listing, in input-file order (before sorting).
            for (int[] r : ranges) {
                String name = enumNames[r[2]];
                for (int cp = r[0]; cp <= r[1]; cp++) {
                    System.out.printf("%05X    %s%n", cp, name);
                }
            }

            // Order the ranges by their first code point. Code points are
            // non-negative, but an explicit comparison avoids relying on that.
            Collections.sort(ranges, new Comparator<int[]>() {
                public int compare(int[] a1, int[] a2) {
                    return a1[0] < a2[0] ? -1 : (a1[0] > a2[0] ? 1 : 0);
                }
            });

            List<int[]> list = consolidate(ranges);

            for (int[] a : list) {
                debug("0x%05x, 0x%05x  %s%n", a[0], a[1], displayName(a, enumNames));
            }
            debug("--->total=%d%n", list.size());

            emitSource(list, names, enumNames);

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses the data lines of the script file into ranges.
     *
     * <p>Lines of length at most 1 and lines starting with {@code '#'} are
     * skipped; lines that do not match {@link #RANGE_LINE} are only reported
     * through {@link #debug}. Consecutive lines that name the same script and
     * are contiguous ({@code start == previousEnd + 1}) are merged into one range.
     *
     * @param in        source of the lines
     * @param scriptMap receives script name -> index, indices being assigned
     *                  in the order in which scripts are first emitted
     * @return the ranges in file order; empty if no line matched
     * @throws IOException if reading fails
     */
    private static List<int[]> readRanges(BufferedReader in,
                                          Map<String,Integer> scriptMap)
        throws IOException
    {
        List<int[]> ranges = new ArrayList<int[]>();
        Matcher m = RANGE_LINE.matcher("");

        int prevS = -1;        // pending range; -1 means "none yet"
        int prevE = -1;
        String prevN = null;

        String line;
        while ((line = in.readLine()) != null) {
            if (line.length() <= 1 || line.charAt(0) == '#') {
                continue;
            }
            m.reset(line);
            if (!m.matches()) {
                debug("Warning: Unrecognized line <%s>%n", line);
                continue;
            }
            int start = Integer.parseInt(m.group(1), 16);
            int end = (m.group(2) == null) ? start
                                           : Integer.parseInt(m.group(2), 16);
            String name = m.group(3);
            if (name.equals(prevN) && start == prevE + 1) {
                // contiguous continuation of the pending range
                prevE = end;
            } else {
                if (prevS != -1) {
                    ranges.add(newRange(prevS, prevE, prevN, scriptMap));
                }
                debug("%x-%x\t%s%n", prevS, prevE, prevN);
                prevS = start; prevE = end; prevN = name;
            }
        }

        // Flush the last pending range, if there ever was one.
        if (prevS != -1) {
            ranges.add(newRange(prevS, prevE, prevN, scriptMap));
            debug("%x-%x\t%s%n", prevS, prevE, prevN);
        }
        return ranges;
    }

    /**
     * Builds a range triple, registering {@code name} in {@code scriptMap}
     * with the next free index if it is not present yet.
     */
    private static int[] newRange(int start, int end, String name,
                                  Map<String,Integer> scriptMap)
    {
        Integer index = scriptMap.get(name);
        if (index == null) {
            index = Integer.valueOf(scriptMap.size());
            scriptMap.put(name, index);
        }
        return new int[] { start, end, index.intValue() };
    }

    /**
     * Consolidation: there are lots of "reserved" code points embedded in
     * otherwise "sequential" blocks. To make the lookup table smaller, the
     * separated segments are combined, with the assumption that the lookup
     * implementation checks
     * {@code Character.getType() != Character.UNASSIGNED}
     * first (returning UNKNOWN for unassigned code points).
     *
     * <p>For each gap between two neighbouring ranges:
     * <ul>
     *   <li>if the gap contains a code point the running JDK reports as
     *       assigned, an explicit UNKNOWN ({@code -1}) range is inserted;</li>
     *   <li>otherwise, if both neighbours share a script, the later range is
     *       folded into the earlier one;</li>
     *   <li>otherwise the earlier range is stretched to end just before the
     *       later one.</li>
     * </ul>
     * The triples in {@code sorted} are modified in place.
     *
     * @param sorted non-empty list of ranges ordered by first code point
     * @return the consolidated ranges
     */
    private static List<int[]> consolidate(List<int[]> sorted) {
        List<int[]> list = new ArrayList<int[]>();
        int[] last = sorted.get(0);
        list.add(last);

        for (int i = 1; i < sorted.size(); i++) {
            int[] cur = sorted.get(i);
            if (cur[0] != (last[1] + 1)) {

                boolean isNotUnassigned = false;
                for (int cp = last[1] + 1; cp < cur[0]; cp++) {
                    if (Character.getType(cp) != Character.UNASSIGNED) {
                        isNotUnassigned = true;
                        debug("Warning: [%x] is ASSIGNED but in NON script%n", cp);
                        break;
                    }
                }
                if (isNotUnassigned) {
                    // surrogates only?
                    list.add(new int[] { last[1] + 1, cur[0] - 1, -1 /* unknown */ });
                } else if (last[2] == cur[2]) {
                    // combine
                    last[1] = cur[1];
                    continue;
                } else {
                    // expand last
                    last[1] = cur[0] - 1;
                }
            }
            list.add(cur);
            last = cur;
        }
        return list;
    }

    /** Returns the enum constant name for a range, {@code "UNKNOWN"} for index -1. */
    private static String displayName(int[] range, String[] enumNames) {
        return (range[2] != -1) ? enumNames[range[2]] : "UNKNOWN";
    }

    /**
     * Prints the generated {@code Scripts} class: the {@code UnicodeScript}
     * enum constants followed by the parallel {@code scriptStarts} and
     * {@code scripts} tables. If the last range stops short of
     * {@code Character.MAX_CODE_POINT}, a final UNKNOWN entry covers the rest.
     *
     * @param list      consolidated ranges, non-empty
     * @param names     script names as spelled in the input, by script index
     * @param enumNames upper-cased script names, by script index
     */
    private static void emitSource(List<int[]> list, String[] names,
                                   String[] enumNames)
    {
        //////////////////OUTPUT//////////////////////////////////
        print("public class Scripts {%n%n");
        print("    public static enum UnicodeScript {%n");
        for (int i = 0; i < names.length; i++) {
            print("        /**%n         * Unicode script \"%s\".%n         */%n", names[i]);
            print("        %s,%n%n",  enumNames[i]);
        }
        print("        /**%n         * Unicode script \"Unknown\".%n         */%n        UNKNOWN;%n%n");

        int[] last = list.get(list.size() - 1);
        boolean needsTail = last[1] != Character.MAX_CODE_POINT;

        // lookup table
        print("        private static final int[] scriptStarts = {%n");
        for (int[] a : list) {
            String name = displayName(a, enumNames);
            if (a[0] < 0x10000)
                print("            0x%04X,   // %04X..%04X; %s%n",
                      a[0], a[0], a[1], name);
            else
                print("            0x%05X,  // %05X..%05X; %s%n",
                      a[0], a[0], a[1], name);
        }
        if (needsTail)
            print("            0x%05X   // %05X..%06X; %s%n",
                  last[1] + 1, last[1] + 1, Character.MAX_CODE_POINT,
                  "UNKNOWN");
        print("%n        };%n%n");

        print("        private static final UnicodeScript[] scripts = {%n");
        for (int[] a : list) {
            print("            %s,%n", displayName(a, enumNames));
        }
        if (needsTail)
            print("            UNKNOWN%n");
        print("        };%n");
        print("    }%n");
        print("}%n");
    }
}